{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "48293f37",
   "metadata": {},
   "source": [
    "### Analysis of rare SV carrier misexpression events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8cbe21f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.style as style\n",
    "from matplotlib import pyplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "530f9123",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v2/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "express_carrier_dir = wkdir_path.joinpath(\"4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/express_carrier_info\")\n",
    "\n",
    "out_dir_path = wkdir_path.joinpath(\"4_vrnt_enrich/sv_carrier_metrics\")\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "491fb212",
   "metadata": {},
   "outputs": [],
   "source": [
    "CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',\n",
    "               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',\n",
    "               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',\n",
    "               'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY',\n",
    "               ]\n",
    "af_lower, af_upper = (0, 0.01)\n",
    "tpm_cutoff = 0.5\n",
    "z_score_cutoff = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "814a4759",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chr1\n",
      "chr2\n",
      "chr3\n",
      "chr4\n",
      "chr5\n",
      "chr6\n",
      "chr7\n",
      "chr8\n",
      "chr9\n",
      "chr10\n",
      "chr11\n",
      "chr12\n",
      "chr13\n",
      "chr14\n",
      "chr15\n",
      "chr16\n",
      "chr17\n",
      "chr18\n",
      "chr19\n",
      "chr20\n",
      "chr21\n",
      "chr22\n"
     ]
    }
   ],
   "source": [
    "misexp_rare_carriers_df_list = []\n",
    "express_carrier_path = Path(express_carrier_dir)\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    print(chrom)\n",
    "    chrom_express_carrier = express_carrier_path.joinpath(f\"{chrom}_express_carrier_info.tsv\")\n",
    "    chrom_express_carrier_df = pd.read_csv(chrom_express_carrier, sep=\"\\t\")\n",
    "    misexp_rare_carriers_df = chrom_express_carrier_df[(chrom_express_carrier_df.AF >= af_lower) & \n",
    "                         (chrom_express_carrier_df.AF < af_upper) & \n",
    "                         (chrom_express_carrier_df.TPM > tpm_cutoff) & \n",
    "                         (chrom_express_carrier_df[\"z-score\"] > z_score_cutoff) &\n",
    "                         (chrom_express_carrier_df.genotype.isin([\"(0, 1)\", \"(1, 1)\"]))\n",
    "                        ].copy()\n",
    "    misexp_rare_carriers_df_list.append(misexp_rare_carriers_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cbd364d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_chrom_misexp_carriers_df = pd.concat(misexp_rare_carriers_df_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7d059e48",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of misexpression carriers: 312\n",
      "Total number of variants affecting one gene: 297\n",
      "Percentage of variants affecting one gene: 0.9519230769230769\n"
     ]
    }
   ],
   "source": [
    "# SVs affecting multiple genes \n",
    "vrnt_misexp_gene_carrier_df = all_chrom_misexp_carriers_df[[\"vrnt_id\", \"gene_id\"]].drop_duplicates()\n",
    "vrnt_misexp_gene_count_df = pd.DataFrame(vrnt_misexp_gene_carrier_df.groupby(\"vrnt_id\", as_index=False)[\"gene_id\"].count()).rename(columns={\"gene_id\": \"gene_count\"})\n",
    "total_vrnt_misexp_carrier = vrnt_misexp_gene_count_df.shape[0]\n",
    "print(f\"Total number of misexpression carriers: {total_vrnt_misexp_carrier}\")\n",
    "total_vrnt_misexp_carrier_one_gene = vrnt_misexp_gene_count_df[vrnt_misexp_gene_count_df.gene_count == 1].shape[0]\n",
    "print(f\"Total number of variants affecting one gene: {total_vrnt_misexp_carrier_one_gene}\")\n",
    "print(f\"Percentage of variants affecting one gene: {total_vrnt_misexp_carrier_one_gene/total_vrnt_misexp_carrier}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6375b5fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file for plot \n",
    "vrnt_misexp_gene_count_path = out_dir_path.joinpath(\"vrnt_misexp_gene_count.tsv\")\n",
    "vrnt_misexp_gene_count_df.to_csv(vrnt_misexp_gene_count_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4e83a330",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total samples with misexpression carriers: 206\n",
      "Total samples with one misexpression carrier: 183\n",
      "Total samples with more than one misexpression carrier: 23\n",
      "Percentage of samples  affecting one gene: 0.11165048543689321\n"
     ]
    }
   ],
   "source": [
    "# samples containing many misexpression events \n",
    "smpl_misexp_gene_carrier_df = all_chrom_misexp_carriers_df[[\"rna_id\", \"gene_id\"]].drop_duplicates()\n",
    "smpl_misexp_gene_carrier_count_df = pd.DataFrame(smpl_misexp_gene_carrier_df.groupby(\"rna_id\", as_index=False)[\"gene_id\"].count()).rename(columns={\"gene_id\": \"gene_count\"})\n",
    "total_smpls_misexp_carrier = smpl_misexp_gene_carrier_count_df.shape[0]\n",
    "print(f\"Total samples with misexpression carriers: {total_smpls_misexp_carrier}\")\n",
    "total_smpl_misexp_carrier_one_gene = smpl_misexp_gene_carrier_count_df[smpl_misexp_gene_carrier_count_df.gene_count == 1].shape[0]\n",
    "print(f\"Total samples with one misexpression carrier: {total_smpl_misexp_carrier_one_gene}\")\n",
    "print(f\"Total samples with more than one misexpression carrier: {(total_smpls_misexp_carrier - total_smpl_misexp_carrier_one_gene)}\")\n",
    "print(f\"Percentage of samples  affecting one gene: {(total_smpls_misexp_carrier - total_smpl_misexp_carrier_one_gene)/total_smpls_misexp_carrier}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7a18ce37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file for plot \n",
    "smpl_misexp_gene_carrier_count_path = out_dir_path.joinpath(\"smpl_misexp_gene_carrier_count.tsv\")\n",
    "smpl_misexp_gene_carrier_count_df.to_csv(smpl_misexp_gene_carrier_count_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ec5a558",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
